In [ ]:
%%HTML
<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
In [ ]:
import pandas as pd

# custom library
from TextPreProcessor import TextInputProcessor


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

import re

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

import os
import numpy as np
import matplotlib.pyplot as plt

from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)


import plotly.io as pio
pio.renderers.default = "notebook+pdf"
In [ ]:
def cleanText(text):
    # remove newline
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()
In [ ]:
df = pd.read_csv("spamham.csv")
MAPPING = { 1 : "spam", 0: "ham"}
In [ ]:
def mergeDfs(datasetPaths):
    df = pd.read_csv(datasetPaths[0])

    for i in range(1, len(datasetPaths)):
        tmp = pd.read_csv(datasetPaths[i])
        tmp = tmp.drop("Unnamed: 0", axis = 1)
        df = pd.concat([df, tmp])
    return df
In [ ]:
df = mergeDfs(['spamham.csv', 'spam_ham_dataset.csv', 'spamHamData.csv'])
df.head()
Out[ ]:
text spam label
0 Subject: naturally irresistible your corporate... 1 NaN
1 Subject: the stock trading gunslinger fanny i... 1 NaN
2 Subject: unbelievable new homes made easy im ... 1 NaN
3 Subject: 4 color printing special request add... 1 NaN
4 Subject: do not have money , get software cds ... 1 NaN
In [ ]:
#convert to lowercase
df["text"] = df["text"].apply(lambda x: cleanText(x[len("subject:"): ] ) )
In [ ]:
df.isna().sum()
Out[ ]:
text        0
spam        0
label    5728
dtype: int64
In [ ]:
df = df.drop("label", axis = 1)
df.head()
Out[ ]:
text spam
0 naturally irresistible your corporate identity... 1
1 the stock trading gunslinger fanny is merrill ... 1
2 unbelievable new homes made easy im wanting to... 1
3 4 color printing special request additional in... 1
4 do not have money , get software cds from here... 1
In [ ]:
df.groupby("spam").describe()
Out[ ]:
text
count unique top freq
spam
0 10583 10128 calpine daily gas nomination > ricky a . arche... 20
1 3368 3326 16
In [ ]:
fig = go.Figure(
    go.Bar(x = list(map(lambda x : MAPPING[x], set(df.spam))), y = df.groupby("spam").count().text.tolist())
)

fig.update_layout(
    title = "Distribution of Classes",
    yaxis_title = "Count",
    xaxis_title = "Class"
)

fig.show()
In [ ]:
# email filter with length > 10000
df = df[df["text"].apply(lambda x: len(x) < 10000)]
df.head()
Out[ ]:
text spam
0 naturally irresistible your corporate identity... 1
1 the stock trading gunslinger fanny is merrill ... 1
2 unbelievable new homes made easy im wanting to... 1
3 4 color printing special request additional in... 1
4 do not have money , get software cds from here... 1

Here, the dataset is imbalanced. So, when we use NN we may end with with a local minima

In [ ]:
# plot distrubution of message length of messages (histogram)

hist1 = go.Histogram(x = [len(row.text) for ind, row in df[df.spam == 1].iterrows()],
        nbinsx=8, name = "Spam Length")
hist2 = go.Histogram(x = [len(row.text) for ind, row in df[df.spam == 0].iterrows()], 
         nbinsx=8, name = "Ham Length")

fig = go.Figure(data = [hist1, hist2])

fig.update_layout(
    title = "Histogram of message length of Spam vs Non spam emails",
    xaxis_title = "Message length",
    yaxis_title = "Count of Range",
    legend_title="Type of Email",
)

fig.show()

Split dataset into train and test¶

Split in ratio 0.3

Use Naive Bayes to classify the data¶

Three Steps :

  1. first Vectorize the data
  2. Normalize email data using tfidf vectorizer
  3. Use a Support Vector Machine to classify emails
In [ ]:
tfidf_vec = TfidfVectorizer(smooth_idf=True,use_idf=True) 
textFeatures = tfidf_vec.fit_transform(df["text"])
In [ ]:
tfidf_df = pd.DataFrame(textFeatures.toarray(), columns = tfidf_vec.get_feature_names_out())
tfidf_df[4:10].T
Out[ ]:
4 5 6 7 8 9
00 0.0 0.0 0.0 0.0 0.000000 0.0
000 0.0 0.0 0.0 0.0 0.071134 0.0
0000 0.0 0.0 0.0 0.0 0.000000 0.0
000000 0.0 0.0 0.0 0.0 0.000000 0.0
00000000 0.0 0.0 0.0 0.0 0.000000 0.0
... ... ... ... ... ... ...
zymg 0.0 0.0 0.0 0.0 0.000000 0.0
zzmacmac 0.0 0.0 0.0 0.0 0.000000 0.0
zzn 0.0 0.0 0.0 0.0 0.000000 0.0
zzncacst 0.0 0.0 0.0 0.0 0.000000 0.0
zzzz 0.0 0.0 0.0 0.0 0.000000 0.0

35406 rows × 6 columns

In [ ]:
tfidf_vec.get_feature_names_out()
Out[ ]:
array(['00', '000', '0000', ..., 'zzn', 'zzncacst', 'zzzz'], dtype=object)

Split data into test and train¶

Use a split of 70% train and 30% train

In [ ]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.spam, test_size=0.3, random_state=111) 
In [ ]:
print(X_train)
print(f"Training set : {X_train.shape}")
2477    times 2 filing units pat : recently , i talked...
3363    kwbt bio - tech signs letter of intent , gcm a...
3599    cialis , xanax , valium , viagra at low price ...
213     how are you today ? orchard m . guei . republi...
4246    global risk management operations recognizing ...
                              ...                        
1760    dave n out until july 5 th i will be taking a ...
4219    re : recruiting at cmu computational finance p...
4858    re : introduction i would be very happy to par...
4522    from 17 because paliourg , # valiumxanaxcialis...
1308    -list-admin@freshrpms.net wed oct 2 11:45:08 2...
Name: text, Length: 9657, dtype: object
Training set : (9657,)
In [ ]:
print(y_train)
2477    0
3363    1
3599    1
213     1
4246    0
       ..
1760    0
4219    0
4858    0
4522    1
1308    0
Name: spam, Length: 9657, dtype: int64

Create a Pipeline to send in text input and produce an SVM output¶

In [ ]:
spamPipe = Pipeline([ 
    ('text_preProcess', TextInputProcessor()),
    ('tfidf', TfidfVectorizer()),
    ('SVM', SVC(kernel='sigmoid', gamma=1.0))      
])
print(spamPipe)
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', SVC(gamma=1.0, kernel='sigmoid'))])
In [ ]:
spamPipe.fit(X_train, y_train)
Out[ ]:
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', SVC(gamma=1.0, kernel='sigmoid'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', SVC(gamma=1.0, kernel='sigmoid'))])
<TextPreProcessor.TextInputProcessor object at 0x000001FF6A15AD90>
TfidfVectorizer()
SVC(gamma=1.0, kernel='sigmoid')
In [ ]:
# model metrics
def showMetrics(testData, truths, model):
    preds = model.predict(testData)
    cm = confusion_matrix(truths, preds)
    print("Confusion Matrix")
    print(cm)
    print("Classification report")
    print(classification_report(truths, preds, target_names = ["ham", "spam"]) )

    ax = sns.heatmap(cm, annot=True, fmt='d')
    ax.set(title = "HeatMap of Predictions", xlabel="Predicted", ylabel="Truth")
    plt.show()
In [ ]:
showMetrics(X_test, y_test, spamPipe)
Confusion Matrix
[[3111   26]
 [  17  985]]
Classification report
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      3137
        spam       0.97      0.98      0.98      1002

    accuracy                           0.99      4139
   macro avg       0.98      0.99      0.99      4139
weighted avg       0.99      0.99      0.99      4139

In [ ]:
# sample function in sklearn to print learning curves
from sklearn.model_selection import learning_curve    


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and traning learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
In [ ]:
plot_learning_curve(spamPipe, "Learning Curves (SVM, Sigmoid Kernel, $\gamma=1$)", X_train, y_train)
Out[ ]:
<module 'matplotlib.pyplot' from 'd:\\Python\\Python39\\lib\\site-packages\\matplotlib\\pyplot.py'>

Testing on outside data¶

In [ ]:
# test on random spam mail from inbox

examples = ["""
I am a senior official from the World Health Organization(WHO), I was instructed to contact you regarding on-going Compensation Grant Awards for Covid-19, approved by the World Health Organization alongside the United Nations as compensation payment/rewards for eligible beneficiaries.

Have you received your Grant Award payment of US$500,000 for COVID-19? If NO! Kindly be informed that you are among the lucky winners randomly selected to benefit from this relief program.

Contact the info below to reach Grant Award Officer-""",
"""
I am Charles Rettig, US Commissioner of Internal Revenue Services.
This is to inform you about the release of your overdue benefits fund
of $3,900,000.00 USD which was on hold for a long time. Your
consignment box of $3.9 Million United States Dollars has been
approved in your name by all the United States Federal enforcement law
for funds and is ready to deliver to you as soon as you reconfirm your
mailing address.
As this matter is urgent, I look forward to hearing from you as soon
as possible.

Please reply with your needed information such as
Full name..........
Address..........
Phone number .........
Country........
Occupation......

Regards. Charles Rettig.""", """Hi
Class "Post Analysis: BST, Heaps and Map" has been scheduled on 14 Oct 2022 07:00PM
Duration : 120 Mins
Mentor : jaydals0eo8""",
"""Link: https://attendee.gotowebinar.com/register/2135259703821289230

Welcome, All! In this webinar, we will discuss the following: 
* Cloud computing market will be worth $800 billion by 2025 
* 30,000+ Job openings for freshers in AWS 
* Career opportunities in AWS Cloud Computing 
* Avg. salary of Rs. 4.5 LPA 
* Continuous placement support
* Companies hiring 2022, 2021, 2020, and 2019 batch students 
* Why AWS is the hot domain in MNCs
When
Friday 21 Oct 2022 ⋅ 6pm – 7pm (India Standard Time - Kolkata)
Location
https://attendee.gotowebinar.com/register/2135259703821289230
View map
Organiser
yasmin.taj@ethnus.com"""]
In [ ]:
pred = spamPipe.predict(examples)
print(f"Predictions = {pred}")
Predictions = [1 1 0 0]
In [ ]:
# saving model
import joblib

with open('spam_model.pkl', 'wb') as f:
    joblib.dump(spamPipe, f)
In [ ]:
spamPipe1 = Pipeline([ 
    ('text_preProcess', TextInputProcessor()),
    ('tfidf', TfidfVectorizer()),
    ('SVM', RandomForestClassifier(n_estimators=15, random_state = 11))      
])
print(spamPipe1)
spamPipe1.fit(X_train, y_train)
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
                ('tfidf', TfidfVectorizer()),
                ('SVM',
                 RandomForestClassifier(n_estimators=15, random_state=11))])
Out[ ]:
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
                ('tfidf', TfidfVectorizer()),
                ('SVM',
                 RandomForestClassifier(n_estimators=15, random_state=11))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>),
                ('tfidf', TfidfVectorizer()),
                ('SVM',
                 RandomForestClassifier(n_estimators=15, random_state=11))])
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04EE0>
TfidfVectorizer()
RandomForestClassifier(n_estimators=15, random_state=11)
In [ ]:
showMetrics(X_test, y_test, spamPipe1)
Confusion Matrix
[[3105   32]
 [ 128  874]]
Classification report
              precision    recall  f1-score   support

         ham       0.96      0.99      0.97      3137
        spam       0.96      0.87      0.92      1002

    accuracy                           0.96      4139
   macro avg       0.96      0.93      0.95      4139
weighted avg       0.96      0.96      0.96      4139

In [ ]:
spamPipe2 = Pipeline([ 
    ('text_preProcess', TextInputProcessor()),
    ('tfidf', TfidfVectorizer()),
    ('SVM', KNeighborsClassifier(n_neighbors=50))      
])
print(spamPipe2)
spamPipe2.fit(X_train, y_train)
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', KNeighborsClassifier(n_neighbors=50))])
Out[ ]:
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', KNeighborsClassifier(n_neighbors=50))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('text_preProcess',
                 <TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>),
                ('tfidf', TfidfVectorizer()),
                ('SVM', KNeighborsClassifier(n_neighbors=50))])
<TextPreProcessor.TextInputProcessor object at 0x000001FF6BC04A90>
TfidfVectorizer()
KNeighborsClassifier(n_neighbors=50)
In [ ]:
showMetrics(X_test, y_test, spamPipe2)
Confusion Matrix
[[3051   86]
 [  60  942]]
Classification report
              precision    recall  f1-score   support

         ham       0.98      0.97      0.98      3137
        spam       0.92      0.94      0.93      1002

    accuracy                           0.96      4139
   macro avg       0.95      0.96      0.95      4139
weighted avg       0.97      0.96      0.96      4139